library(stm)
library(ggplot2)
library(tidyverse)
library(magrittr)
library(sandwich) 
library(lmtest)
library(stargazer)
library(dotwhisker)

############ TEST 1: Weiboscope data

load(file = "data/weibo.Rdata")
load(file = "data/reddit.Rdata")

# original
uncen_quantiles <- quantile(uncen$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(uncen$politics_score, na.rm = TRUE)
sd(uncen$politics_score, na.rm = TRUE)
ufc_quantiles <- quantile(ufc$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(ufc$politics_score, na.rm = TRUE)
sd(ufc$politics_score, na.rm = TRUE)
cen_quantiles <- quantile(cen$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(cen$politics_score, na.rm = TRUE)
sd(cen$politics_score, na.rm = TRUE)

china_irl_quantiles <- quantile(china_irl$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(china_irl$politics_score, na.rm = TRUE)
sd(china_irl$politics_score, na.rm = TRUE)
chonglangtv_quantiles <- quantile(chonglangtv$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(chonglangtv$politics_score, na.rm = TRUE)
sd(chonglangtv$politics_score, na.rm = TRUE)
DoubanGoosegroup_quantiles <- quantile(DoubanGoosegroup$politics_score, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(DoubanGoosegroup$politics_score, na.rm = TRUE)
sd(DoubanGoosegroup$politics_score, na.rm = TRUE)

# keywords removed
uncen_quantiles_removed <- quantile(uncen$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(uncen$politics_score_removed, na.rm = TRUE)
sd(uncen$politics_score_removed, na.rm = TRUE)
ufc_quantiles_removed <- quantile(ufc$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(ufc$politics_score_removed, na.rm = TRUE)
sd(ufc$politics_score_removed, na.rm = TRUE)
cen_quantiles_removed <- quantile(cen$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(cen$politics_score_removed, na.rm = TRUE)
sd(cen$politics_score_removed, na.rm = TRUE)

china_irl_quantiles_removed <- quantile(china_irl$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(china_irl$politics_score_removed, na.rm = TRUE)
sd(china_irl$politics_score_removed, na.rm = TRUE)
chonglangtv_quantiles_removed <- quantile(chonglangtv$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(chonglangtv$politics_score_removed, na.rm = TRUE)
sd(chonglangtv$politics_score_removed, na.rm = TRUE)
DoubanGoosegroup_quantiles_removed <- quantile(DoubanGoosegroup$politics_score_removed, probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE)
mean(DoubanGoosegroup$politics_score_removed, na.rm = TRUE)
sd(DoubanGoosegroup$politics_score_removed, na.rm = TRUE)


# A bootstrap test
sample_mean_ufc <- c()
sample_median_ufc <- c()
sample_mean_cen <- c()
sample_median_cen <- c()
sample_mean_uncen <- c()
sample_median_uncen <- c()

sample_mean_ufc_removed <- c()
sample_median_ufc_removed <- c()
sample_mean_cen_removed <- c()
sample_median_cen_removed <- c()
sample_mean_uncen_removed <- c()
sample_median_uncen_removed <- c()

n_sample <- 2000
frac <- 0.005

for (i in 1:n_sample){
  sample_set <- ufc %>% sample_n(size = nrow(.) * frac, replace = TRUE)
  
  y <- sample_set %>% pull(politics_score)
  avg <- mean(y)
  med <- median(y)
  
  y_removed <- sample_set %>% pull(politics_score_removed)
  avg_removed <- mean(y_removed)
  med_removed <- median(y_removed)
  
  sample_mean_ufc <- c(sample_mean_ufc, avg)
  sample_median_ufc <- c(sample_median_ufc, med)
  
  sample_mean_ufc_removed <- c(sample_mean_ufc_removed, avg_removed)
  sample_median_ufc_removed <- c(sample_median_ufc_removed, med_removed)
}

for (i in 1:n_sample){
  sample_set <- cen %>% sample_n(size = nrow(.) * frac, replace = TRUE)
  
  y <- sample_set %>% pull(politics_score)
  avg <- mean(y)
  med <- median(y)
  
  y_removed <- sample_set %>% pull(politics_score_removed)
  avg_removed <- mean(y_removed)
  med_removed <- median(y_removed)
  
  sample_mean_cen <- c(sample_mean_cen, avg)
  sample_median_cen <- c(sample_median_cen, med)
  
  sample_mean_cen_removed <- c(sample_mean_cen_removed, avg_removed)
  sample_median_cen_removed <- c(sample_median_cen_removed, med_removed)
}

for (i in 1:n_sample){
  sample_set <- uncen %>% sample_n(size = nrow(.) * frac, replace = TRUE) 
  
  y <- sample_set %>% pull(politics_score)
  avg <- mean(y)
  med <- median(y)
  
  y_removed <- sample_set %>% pull(politics_score_removed)
  avg_removed <- mean(y_removed)
  med_removed <- median(y_removed)
  
  sample_mean_uncen <- c(sample_mean_uncen, avg)
  sample_median_uncen <- c(sample_median_uncen, med)
  
  sample_mean_uncen_removed <- c(sample_mean_uncen_removed, avg_removed)
  sample_median_uncen_removed <- c(sample_median_uncen_removed, med_removed)
}

# Plot 1
# Create a data frame with the sample means
sample_means <- data.frame(sample_mean = c(sample_mean_ufc, sample_mean_cen, sample_mean_uncen, 
                                           sample_mean_ufc_removed, sample_mean_cen_removed, sample_mean_uncen_removed),
                           group = rep(c('Uncensored from Censored', 'Censored', 'Uncensored',  
                                         'Uncensored from Censored No Keyword', 'Censored No Keyword', 'Uncensored No Keyword'), 
                                       each = length(sample_mean_ufc)),
                           group2 = rep(c('Uncensored Posts (Censored Users)', 'Censored Posts', 'Uncensored Posts (Uncensored Users)',  
                                          'Uncensored Posts (Censored Users)', 'Censored Posts', 'Uncensored Posts (Uncensored Users)'), 
                                        each = length(sample_mean_ufc)))

sample_means$group <- factor(sample_means$group, levels = c('Uncensored from Censored', 'Censored', 'Uncensored',  
                                      'Uncensored from Censored No Keyword', 'Censored No Keyword', 'Uncensored No Keyword'), labels = 1:6) 
sample_means$group2 <- as.factor(sample_means$group2)

# Create a density plot with ggplot2
p <- ggplot(data = sample_means, aes(x = sample_mean, fill = group, colour = group2)) + 
  scale_color_manual(values=c("#999999", "#0072B2", "#CD9B1D")) +
  geom_density(size = 0.8) +
  scale_fill_manual(values = c(alpha(c("#0072B2", "#999999", "#CD9B1D"), 0.85), 
                               alpha(c("#0072B2", "#999999", "#CD9B1D"), 0.25)), guide = "none") +
  labs(x = 'Political Sensitivity Score - Bootstrap Sample Mean', y = 'Density') +
  theme_bw() +
  guides(colour = guide_legend("Group of Posts")) +
  theme(legend.position = 'bottom', text = element_text(size = 14))
# ggtitle('Distribution of Bootstrapping Sample Means')

p <- p + annotate("segment", x = mean(cen$politics_score_removed), 
             xend = mean(cen$politics_score), 
             y = 50, yend = 50,
             arrow = arrow(ends = "both", angle = 90, length = unit(.15,"cm")),
             colour = "#999999") + 
  annotate("segment", x = mean(ufc$politics_score_removed), 
             xend = mean(ufc$politics_score), 
             y = 100, yend = 100,
             arrow = arrow(ends = "both", angle = 90, length = unit(.15,"cm")),
             colour = "#0072B2") + 
  annotate("segment", x = mean(uncen$politics_score_removed), 
             xend = mean(uncen$politics_score), 
             y = 410, yend = 410,
             arrow = arrow(ends = "both", angle = 90, length = unit(.15,"cm")),
             colour = "#CD9B1D")

# mean(cen$politics_score - cen$politics_score_removed)
# mean(ufc$politics_score - ufc$politics_score_removed)
# mean(uncen$politics_score - uncen$politics_score_removed)
p <- p + annotate("text", x = mean(cen$politics_score_removed + cen$politics_score)/2, y = 70,
                  colour = "#999999", label = paste("Diff = 0.078")) + 
  annotate("text", x = mean(ufc$politics_score_removed + ufc$politics_score)/2, y = 120,
           colour = "#0072B2", label = paste("Diff = 0.025")) + 
  annotate("text", x = 0.1, y = 425, 
           colour = "#CD9B1D", label = paste("Diff = 0.005"))

# Save the plot
ggsave('figs/f1b_weibo.pdf', p, width = 10, height = 5)



############ TEST 2: Petition Vs. Protest
load("data/petition.Rdata")

PATH <- "topic_analysis/output_petition/stm/"
load(paste0(PATH, "petition_Corpus.stm.RData"))
load(paste0(PATH, "stm_base_10.RData"))

petition_topic_reg <- lm(petition$politics_score[as.numeric(names(petition_Corpus.stm$documents$documents))] ~ stm_result_empty$theta)
petition_topic_reg_removed <- lm(petition$politics_score_removed[as.numeric(names(petition_Corpus.stm$documents$documents))] ~ stm_result_empty$theta)

petition_topic_sensitivity <- data.frame(Topic = 1:10, politics_score_mean = petition_topic_reg$coefficients[c(2:10, 1)],
                                         politics_score_mean_removed = petition_topic_reg_removed$coefficients[c(2:10, 1)])
petition_topic_sensitivity$politics_score_mean[1:9] <- petition_topic_sensitivity$politics_score_mean[1:9] + petition_topic_sensitivity$politics_score_mean[10]
petition_topic_sensitivity$politics_score_mean_removed[1:9] <- petition_topic_sensitivity$politics_score_mean_removed[1:9] + petition_topic_sensitivity$politics_score_mean_removed[10]
petition_topic_sensitivity$Topic_title <- c("Social Welfare (Rural)", "Public Transportation",
                                            "Infrastructure and Land Use (Rural)",
                                            "Urban Livelihood", "Relocation & Property",
                                            "Urban Property & Management", "Consumer Complaints",
                                            "Employment & Compensation", "Medical Care", "Education")
petition_topic_sensitivity$Topic_prop <- apply(stm_result_empty$theta, MARGIN = 2, FUN = mean)
petition_topic_sensitivity$Topic_keyword <- c("Hukou, Subsidy, Poverty Alleviation",
                                              "Urban Development, Public Transportation",
                                              "Village, Rural Land, Irrigation",
                                              "Urban Community, Trash, Noise",
                                              "Relocation, Compensation, Timetable",
                                              "Urban Community, Property, Fee",
                                              "Fee, Complaint, Company",
                                              "Company, Public Employee, Unpaid Wage",
                                              "Hospital, Elderly, Fee",
                                              "Child, Education Bureau, School")
petition_topic_sensitivity <- petition_topic_sensitivity[order(petition_topic_sensitivity$politics_score_mean, decreasing = TRUE),]
rownames(petition_topic_sensitivity) <- NULL
petition_topic_sensitivity$Source <- "Petition"

load("data/protest.Rdata")

PATH <- "topic_analysis/output_protest/stm/"
load(paste0(PATH, "protest_Corpus.stm.RData"))
load(paste0(PATH, "stm_base_10.RData"))

protest_topic_reg <- lm(protest$politics_score[as.numeric(names(protest_Corpus.stm$documents$documents))] ~ stm_result_empty$theta)
protest_topic_reg_removed <- lm(protest$politics_score_removed[as.numeric(names(protest_Corpus.stm$documents$documents))] ~ stm_result_empty$theta)

protest_topic_sensitivity <- data.frame(Topic = 1:10, politics_score_mean = protest_topic_reg$coefficients[c(2:10, 1)],
                                         politics_score_mean_removed = protest_topic_reg_removed$coefficients[c(2:10, 1)])
protest_topic_sensitivity$politics_score_mean[1:9] <- protest_topic_sensitivity$politics_score_mean[1:9] + protest_topic_sensitivity$politics_score_mean[10]
protest_topic_sensitivity$politics_score_mean_removed[1:9] <- protest_topic_sensitivity$politics_score_mean_removed[1:9] + protest_topic_sensitivity$politics_score_mean_removed[10]
protest_topic_sensitivity$Topic_title <- c("Real Estate Dispute (Quality)",
                                           "Medical Dispute (Violence & Police)",
                                           "Labor Strike",
                                           "Road Blockade",
                                           "Unpaid Salary (Migrant Worker)",
                                           "Police/Chengguan Brutality",
                                           "Forced Appropriation",
                                           "Public Gathering at Government Buildings",
                                           "Student and Resident Protest", 
                                           "Real Estate Dispute (Price Cut)")
protest_topic_sensitivity$Topic_prop <- apply(stm_result_empty$theta, MARGIN = 2, FUN = mean)
protest_topic_sensitivity$Topic_keyword <- c("Property, Fraud, Rights",
                                             "Police, Hospital, Disturbance",
                                             "Employee, Strike, Layoff",
                                             "Blockade, Disturbance, Traffic Jam",
                                             "Migrant Worker, Unpaid Wage, Construction",
                                             "Villager, Police/Chengguan, Violence",
                                             "Villager, Government, Demolition",
                                             "Government, Blockade, Petition",
                                             "Student, Protest, Parents",
                                             "Property, Price, Developer")
protest_topic_sensitivity <- protest_topic_sensitivity[order(protest_topic_sensitivity$politics_score_mean, decreasing = TRUE),]
rownames(protest_topic_sensitivity) <- NULL
protest_topic_sensitivity$Source <- "Protest"
# protest_topic_sensitivity$Topic_title <- paste(protest_topic_sensitivity$Topic,
#                                                protest_topic_sensitivity$Source)

protest_petition <- rbind(petition_topic_sensitivity, protest_topic_sensitivity)
protest_petition

p <- protest_petition %>% ggplot(aes(x = politics_score_mean, y = reorder(Topic_title, politics_score_mean), 
                             fill = Source)) + 
  geom_col() +
  scale_fill_manual(values = c(alpha(c("#0072B2", "#CD9B1D"), 0.75))) +
  guides(fill = guide_legend("Topic Source")) +
  geom_text(size = 3.7, aes(label = Topic_keyword), 
    ## make labels left-aligned
    nudge_x = 0.185) +
  xlim(c(0, 0.7)) + 
  xlab("Effect on Political Sensitivity") + ylab("Topic") +
  theme_bw() +
  theme(legend.position = 'bottom', text = element_text(size = 16))  

ggsave('figs/f2a_protest_petition.pdf', p, width = 10, height = 9.5)

p_removed <- protest_petition %>% ggplot(aes(x = politics_score_mean_removed, y = reorder(Topic_title, politics_score_mean_removed), 
                                     fill = Source)) + geom_col() +
  scale_fill_manual(values = c(alpha(c("#0072B2", "#CD9B1D"), 0.75))) +
  guides(fill = guide_legend("Topic Source")) +
  geom_text(size = 3.7, aes(label = Topic_keyword), 
            ## make labels left-aligned
            nudge_x = 0.185) +
  xlim(c(0, 0.7)) + 
  xlab("Effect on Political Sensitivity (Keywords Removed)") + ylab("Topic") +
  theme_bw() +
  theme(legend.position = 'bottom', text = element_text(size = 16))  

ggsave('figs/f2b_protest_petition.pdf', p_removed, width = 10, height = 9.5)

############ TEST 3: Zhihu

load("data/zhihu_COVID.Rdata")
PATH <- "topic_analysis/output_zhihu/stm/"
load(paste0(PATH, "zhihu_covid_Corpus.stm.RData"))
load(paste0(PATH, "stm_base_10.RData"))

zhihu_covid_use <- zhihu_covid[as.numeric(names(zhihu_covid_Corpus.stm$documents$documents)), ]
zhihu_covid_use$year_month <- paste(zhihu_covid_use$year, zhihu_covid_use$month)
zhihu_covid_use$comments_count_log <- log(zhihu_covid_use$comments_count + 1)
zhihu_covid_use$zans_count_log <- log(zhihu_covid_use$zans + 1)
zhihu_covid_use$influence_log <-  log(zhihu_covid_use$comments_count + zhihu_covid_use$zans + 1)
zhihu_covid_use$answer_length_log <- log(zhihu_covid_use$answer_length)

### Regression: Topic Analysis
# Temporal changes in the topic
# Make graphes
summary(lm(stm_result_empty$theta[, 2] ~ as.factor(zhihu_covid_use$year_month)))
summary(lm(stm_result_empty$theta[, 5] ~ as.factor(zhihu_covid_use$year_month)))
summary(lm(stm_result_empty$theta[, 6] ~ as.factor(zhihu_covid_use$year_month)))

zhihu_topic_reg <- lm(politics_score ~ stm_result_empty$theta[, ], data = zhihu_covid_use)
zhihu_topic_reg_removed <- lm(politics_score_removed ~ stm_result_empty$theta[,], data = zhihu_covid_use)

zhihu_topic_sensitivity <- data.frame(Topic = 1:10, politics_score_mean = zhihu_topic_reg$coefficients[c(2:10, 1)],
                                        politics_score_mean_removed = zhihu_topic_reg_removed$coefficients[c(2:10, 1)])
zhihu_topic_sensitivity$politics_score_mean[1:9] <- zhihu_topic_sensitivity$politics_score_mean[1:9] + zhihu_topic_sensitivity$politics_score_mean[10]
zhihu_topic_sensitivity$politics_score_mean_removed[1:9] <- zhihu_topic_sensitivity$politics_score_mean_removed[1:9] + zhihu_topic_sensitivity$politics_score_mean_removed[10]
zhihu_topic_sensitivity$Topic_title <- c("Family and Lifestyle", 
                                         "Political System & International Comparison",
                                         "COVID-19 Infection",
                                         "Economic & Social Impact",
                                         "Global Pandemic & International Comparison",
                                         "Shanghai Lockdown",
                                         "Medication & Vaccine",
                                         "Education & Student Life", "Hospitalization & Xi'an Lockdown", 
                                         "Complaint & Public Opinion")
zhihu_topic_sensitivity$Topic_keyword <- c("Child, Wuhan, Life, Friend, \n Parent, Family", 
                                           "State, Society, People, Government, \n West, Ideology, Democracy",
                                           "Virus, Infection, Death, Data, \n Symptoms, Omicron",
                                           "Work, Salary, Firm, Cost, \n Consumption, Sector",
                                           "United States, India, West, Taiwan, \n Trump, Outbreak",
                                           "Resident Community, Vegetable, Supply, \n Volunteer, Nucleic Acid Test, Delivery",
                                           "Vaccine, Virus, Mutation, Efficacy, \n Anti-Body, Chinese Medicine",
                                           "School, Student, Go-Home, Isolation, \n Teacher, Semester",
                                           "Hospital, Patient, Docter, \n Responsibility, Preganancy, Xi'an",
                                           "Problem, Possibility, News, \n Upvotes, Keyboard Warrior, Double-Standard")
zhihu_topic_sensitivity <- zhihu_topic_sensitivity[order(zhihu_topic_sensitivity$politics_score_mean, decreasing = TRUE),]
rownames(zhihu_topic_sensitivity) <- NULL


p <- zhihu_topic_sensitivity %>% ggplot(aes(x = politics_score_mean, y = reorder(Topic_title, politics_score_mean))) + 
  geom_col() + guides(fill = guide_legend("Topic Source")) +
  geom_text(size = 2, aes(label = Topic_keyword), 
            ## make labels left-aligned
            nudge_x = 0.225) +
  xlim(c(-0.05, 1.3)) + 
  xlab("Effect on Political Sensitivity") + ylab("Topic") +
  theme_bw() +
  theme(legend.position = 'bottom', text = element_text(size = 16))  

ggsave('figs/f3b_zhihu.pdf', p, width = 10, height = 6)


p <- zhihu_topic_sensitivity %>% ggplot(aes(x = politics_score_mean_removed, y = reorder(Topic_title, politics_score_mean_removed))) + 
  geom_col() + guides(fill = guide_legend("Topic Source")) +
  geom_text(size = 2, aes(label = Topic_keyword), 
            ## make labels left-aligned
            nudge_x = 0.225) +
  xlim(c(-0.05, 1.3)) + 
  xlab("Effect on Political Sensitivity") + ylab("Topic") +
  theme_bw() +
  theme(legend.position = 'bottom', text = element_text(size = 16))  

ggsave('figs/f3c_zhihu.pdf', p, width = 10, height = 6)
